Question 1
A
library(ggplot2)
library(tidyverse)
rm(list=ls())
# Define x
x = seq(0,1, length.out=100)
#x = rnorm(100,0,1)
# We choose equally spaced notes over the range [0,1]
knots = seq(0,1,length.out=8) # we set this to 8 as the first and last knots will be removed, so this ensures we are left with 6
# Basis functions per Woods formulation
b1 = function(x) {
rep(1, length(x))
}
b2 = function(x){
x
}
R_x_z = function(x, z) {
((z - 0.5)^2 - 1/12) * ((x - 0.5)^2 - 1/12) / 4 -
((abs(x - z) - 0.5)^4 - 0.5 * (abs(x - z) - 0.5)^2 + 7/240) / 24
}
# Construct the basis matrix
basis_matrix = cbind(b1(x), b2(x),sapply(knots[-c(1,length(knots))],function(k) R_x_z(x,k)))
basis_design_matrix = basis_matrix %>% as_tibble()
colnames(basis_design_matrix) = c("b1", "b2", "b3", "b4", paste0("b", 5:(4 + length(knots))))
basis_design_matrix = basis_design_matrix %>%
mutate(x = x)
basis_melted <- reshape2::melt(basis_design_matrix, id.vars = "x")
ggplot(basis_melted, aes(x = x, y = value, color = variable)) +
geom_line(size = 1) +
labs(title = "Wood Cubic Spline Basis",
x = "x", y = "Basis Function Value", color = "Basis") +
theme_minimal()

B
set.seed(123)
n <- 100
x = rnorm(n,0,1)
y <- 5 + sin(3 * pi * (x-0.6)) + rnorm(n, sd = 0.5^2)
knots <- seq(min(x), max(x), length.out = 32) # We define length of 32 because the first and last knots will be removed, to ensure we have exactly 30 knots
# Construct penalty matrix
knot_positions <- knots[-c(1, length(knots))]
n_knots = length(knot_positions)
S = matrix(0, n_knots, n_knots)
for(i in 1:n_knots){
for (j in 1: n_knots){
S[i,j] = R_x_z(knot_positions[i], knot_positions[j])
}
}
penalized_regression_spline = function(lambda, S, knot_positions, y){
basis_matrix <- cbind(
b1(x),
b2(x),
sapply(knot_positions, function(k) R_x_z(x, k))
)
P = lambda * S
X = basis_matrix[, 3:ncol(basis_matrix)]
XtX_plus_p = t(X)%*%X + P
XtY = t(X)%*%y
beta_hat = solve(XtX_plus_p)%*%XtY
fitted_values = X %*% beta_hat
H = X %*% solve(XtX_plus_p) %*% t(X)
se_errors = sqrt(diag(H))
return (list(fitted_values = fitted_values , se_errors = se_errors))
}
fit_result = penalized_regression_spline(0.00000001, S, knot_positions,y) # choose a random lambda value
plot_data <- data.frame(x = x, y = y, y_hat = fit_result$fitted_values, se_errors=fit_result$se_errors)
ggplot(plot_data, aes(x = x, y = y)) +
geom_point(color = 'black', alpha = 0.6) +
geom_line(aes(y = y_hat), color = 'blue', size = 1.2) +
labs(title = "Penalized Regression Spline Fit",
x = "X",
y = "Y") +
theme_minimal()

C
# (95% confidence intervals)
upper <- fit_result$fitted_values + 1.96 * fit_result$se_errors
lower <- fit_result$fitted_values - 1.96 * fit_result$se_errors
plot_data <- data.frame(x = x, y = y, y_hat = fit_result$fitted_values, lower = lower, upper = upper)
ggplot(plot_data, aes(x = x, y = y)) +
geom_point(color = 'black', alpha = 0.6) +
geom_line(aes(y = y_hat), color = 'blue', size = 1.2) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.1, fill = 'blue') +
labs(title = "Penalized Regression Spline Fit with Confidence Bands",
x = "X",
y = "Y") +
theme_minimal()

D
# Compute GCV as function of lambda
compute_gcv = function(lambda){
basis_matrix <- cbind(
b1(x),
b2(x),
sapply(knot_positions, function(k) R_x_z(x, k))
)
P = lambda * S
X = basis_matrix[, 3:ncol(basis_matrix)]
XtX_plus_p = t(X)%*%X + P
XtY = t(X)%*%y
beta_hat = solve(XtX_plus_p)%*%XtY
fitted_values = X %*% beta_hat
# Compute residuals
residuals = y - fitted_values
H = X %*% solve(XtX_plus_p)%*%t(X)
edf = sum(diag(H))
# Compute the GCV score
n = length(y)
gcv = sum(residuals^2)/(n*(1-edf/n)^2)
return (gcv)
}
lambda_values <- seq(0.00000001, 1, length.out = 5000)
gcv_scores <- sapply(lambda_values, compute_gcv)
gcv_data <- data.frame(lambda = lambda_values, gcv = gcv_scores)
ggplot(gcv_data, aes(x = lambda, y = gcv)) +
geom_line(color = 'blue', size = 1.2) +
labs(title = "GCV as a Function of the Smoothing Parameter",
x = "Smoothing Parameter (lambda)",
y = "GCV Score") +
theme_minimal()

Question 2
Tensor Product Spline
library(gamair)
library(tidyverse)
library(dplyr)
rm(list=ls())
# Basis functions per Woods formulation
b1 = function(x) {
rep(1, length(x))
}
b2 = function(x){
x
}
R_x_z = function(x, z) {
((z - 0.5)^2 - 1/12) * ((x - 0.5)^2 - 1/12) / 4 -
((abs(x - z) - 0.5)^4 - 0.5 * (abs(x - z) - 0.5)^2 + 7/240) / 24
}
data("brain")
brain_data = brain%>%as_tibble()%>%
select(X,Y, medFPQ)
# Calculate tensor product spline
tensor_product_spline = function(x,y, response,x_knot_positions, y_knot_positions, x_penalty_matrix, y_penalty_matrix, lambda) {
# Scale x and y
x_scaled = scale(x)
y_scaled = scale(y)
y_data = as.matrix(response)
x_basis_matrix <- cbind(sapply(x_knot_positions, function(k) R_x_z(x_scaled, k)))
y_basis_matrix <- cbind(sapply(y_knot_positions, function(k) R_x_z(y_scaled, k)))
tensor_basis_matrix = compute_tensor_product(x_basis_matrix, y_basis_matrix)
tensor_penalty_matrix = compute_tensor_product(x_penalty_matrix, y_penalty_matrix)
P = lambda * tensor_penalty_matrix
X = tensor_basis_matrix[1:nrow(y_data),]
XtX_plus_p = t(X)%*%X + P
XtY = t(X)%*%y_data
beta_hat = MASS::ginv(XtX_plus_p)%*%XtY
fitted_values = X %*% beta_hat
H = X %*% MASS::ginv(XtX_plus_p) %*% t(X)
se_errors = sqrt(diag(H))
return (list(fitted_values = fitted_values , se_errors = se_errors))
}
# Tensor product helper function
compute_tensor_product = function(A, B){
m1 = ncol(A)
m2 = ncol(B)
G = matrix(NA, nrow=nrow(A)*nrow(B), ncol = m1 * m2 )
ccol <- 1
for (j in 1:m1) {
for (k in 1:m2) {
G[, ccol] <- A[, j] * B[, k]
ccol <- ccol + 1
}
}
return (G)
}
x_knots <- seq(min(brain_data$X), max(brain$X), length.out = 10)
y_knots <- seq(min(brain_data$Y), max(brain$Y), length.out = 10)
# Construct penalty matrix
construct_penalty_matrix <- function(knots) {
n_knots <- length(knots)
S <- matrix(0, n_knots, n_knots)
for (i in 1:n_knots) {
for (j in 1:n_knots) {
S[i, j] <- R_x_z(knots[i], knots[j])
}
}
S
}
S_x = construct_penalty_matrix(x_knots)
S_y = construct_penalty_matrix(y_knots)
tensor_product_spline_result = tensor_product_spline(
brain_data$X,
brain_data$Y,
brain_data$medFPQ,
x_knots,
y_knots,
S_x,
S_y,
0.00001
)
# ggplot(brain_data, aes(x = X, y = Y)) +
# geom_point(aes(color = medFPQ), size = 2, alpha = 0.6) +
# scale_color_viridis_c() +
# labs(title = "Original Data",
# x = "X",
# y = "Y",
# color = "Observed medFPQ") +
# theme_minimal()
brain_data <- brain_data %>% mutate(Fitted = tensor_product_spline_result$fitted_values)
# Visualize the observed medFPQ values
ggplot(brain_data, aes(x = X, y = Y)) +
geom_point(aes(color = medFPQ), size = 2, alpha = 0.6) +
scale_color_viridis_c() +
labs(title = "Observed medFPQ",
x = "X",
y = "Y",
color = "Observed medFPQ") +
theme_minimal()

ggplot(brain_data, aes(x = X, y = Y)) +
geom_tile(aes(fill = Fitted)) +
scale_fill_viridis_c() +
labs(title = "Tensor Product Spline Fit",
x = "X",
y = "Y",
fill = "Fitted medFPQ") +
theme_minimal()

plot_data <- data.frame(
X = brain_data$X,
Y = brain_data$Y,
Original = brain_data$medFPQ,
Fitted = tensor_product_spline_result$fitted_values
)
plot_data_long <- tidyr::pivot_longer(plot_data,
cols = c(Original, Fitted),
names_to = "Type",
values_to = "Value")
plot_data_long$Type <- factor(plot_data_long$Type, levels = c("Original", "Fitted"))
# ggplot(plot_data_long, aes(x = X, y = Y, fill = Value)) +
# geom_tile() +
# facet_wrap(~ Type) +
# scale_fill_viridis_c() +
# labs(title = "Tensor Product Original vs Fitted Values",
# x = "X coordinate",
# y = "Y coordinate",
# fill = "medFPQ") +
# theme_minimal() +
# theme(aspect.ratio = 1)
Thin Plate Spline
library(gamair)
library(tidyverse)
library(dplyr)
rm(list=ls())
data("brain")
brain_data = brain %>% as_tibble() %>%
select(X, Y, medFPQ)
# Radial Basis function
tps_basis = function(r){
ifelse(r > 0, r^2 * log(r), 0)
}
create_design_matrix = function(data, knots){
n = nrow(data)
m = nrow(knots)
# Create matrices of coordinates
data_mat <- as.matrix(data[, 1:2])
knots_mat <- as.matrix(knots[, 1:2])
# Compute pairwise distances
distances <- fields::rdist(data_mat, knots_mat)
B <- apply(distances, c(1,2), tps_basis)
A = cbind(1, data)
X = cbind(A, B)
return (list(X = X, A = A, B = B))
}
# Compute Thin Plate Spline
thin_plate_spline = function(data, lambda, num_knots){
n = nrow(data)
knot_indices = seq(1, n, length.out = num_knots)
knots = data[knot_indices, 1:2]
design_matrix = create_design_matrix(data[,1:2], knots)
X = as.matrix(design_matrix$X)
A = design_matrix$A
B = design_matrix$B
m = ncol(B)
P = rbind(
cbind(matrix(0, 3, 3), matrix(0, 3, m)),
cbind(matrix(0, m, 3), B[knot_indices,])
)
y = as.matrix(data[, 3])
XtX_plus_p = t(X) %*% X + (lambda * P)
XtY = t(X) %*% y
beta_hat = solve(XtX_plus_p) %*% XtY
fitted_values = X %*% beta_hat
colnames(fitted_values) = "Fitted"
H = X %*% solve(XtX_plus_p) %*% t(X)
se_errors = sqrt(diag(H))
return (list(fitted_values = fitted_values, se_errors = se_errors))
}
tps_result = thin_plate_spline(brain_data, 0.5, 10)
#head(tps_result$fitted_values)
brain_data <- brain_data %>% mutate(Fitted = tps_result$fitted_values)
ggplot(brain_data, aes(x = X, y = Y)) +
geom_tile(aes(fill = Fitted)) +
scale_fill_viridis_c() +
labs(title = "Thin-Plate Spline Fit",
x = "X",
y = "Y",
fill = "Fitted medFPQ") +
theme_minimal()

plot_data <- data.frame(
X = brain_data$X,
Y = brain_data$Y,
Original = brain_data$medFPQ,
Fitted = tps_result$fitted_values
)
plot_data_long <- tidyr::pivot_longer(plot_data, cols = c(Original, Fitted), names_to = "Type", values_to = "medFPQ")
plot_data_long$Type <- factor(plot_data_long$Type, levels = c("Original", "Fitted"))
p = ggplot(plot_data_long, aes(x = X, y = Y, fill = medFPQ)) +
geom_tile() +
facet_wrap(~ Type) +
scale_fill_viridis_c() +
labs(title = "Thin-Plate Spline: Original vs Fitted Values",
x = "X",
y = "Y",
fill = "medFPQ") +
theme_minimal() +
theme(aspect.ratio = 1)
#print(p)
Tensor Product and Thin Plate Spline: Using mgcv function
library(gamair)
library(mgcv)
library(tidyverse)
library(ggplot2)
data(brain)
brain_data <- brain %>%
as_tibble() %>%
select(X, Y, medFPQ)
tp_model <- gam(medFPQ ~ te(X, Y, k = 10), data = brain_data)
tps_model <- gam(medFPQ ~ s(X, Y, bs = "tp", k = 100), data = brain_data)
summary(tp_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medFPQ ~ te(X, Y, k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.24791 0.03003 41.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(X,Y) 90.09 91.84 9.934 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.347 Deviance explained = 38.4%
## GCV = 1.5007 Scale est. = 1.4135 n = 1567
summary(tps_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medFPQ ~ s(X, Y, bs = "tp", k = 100)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.24791 0.03029 41.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(X,Y) 86.49 96.33 8.503 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.335 Deviance explained = 37.2%
## GCV = 1.5232 Scale est. = 1.4381 n = 1567
brain_data$Fitted_tp <- predict(tp_model, newdata = brain_data)
brain_data$Fitted_tps <- predict(tps_model, newdata = brain_data)
original_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = medFPQ)) +
geom_tile() +
scale_fill_viridis_c() +
labs(title = "Original Data",
x = "X coordinate",
y = "Y coordinate",
fill = "medFPQ") +
theme_minimal()
original_plot

tp_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = Fitted_tp)) +
geom_tile() +
scale_fill_viridis_c() +
labs(title = "Tensor-Product Spline Fit using mgcv",
x = "X coordinate",
y = "Y coordinate",
fill = "Fitted medFPQ") +
theme_minimal()
tp_plot

tps_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = Fitted_tps)) +
geom_tile() +
scale_fill_viridis_c() +
labs(title = "Thin-Plate Spline Fit using mgcv",
x = "X coordinate",
y = "Y coordinate",
fill = "Fitted medFPQ") +
theme_minimal()
tps_plot

Question 3: GAMs and MARS
GAM and Logistic Regression
library(mgcv)
library(caret)
rm(list = ls())
pulsar_data = read.csv2("Pulsar.csv", header = TRUE, sep=",")
head(pulsar_data)
## Mean_Integrated SD EK Skewness Mean_DMSNR_Curve
## 1 140.5625 55.68378214 -0.234571412 -0.699648398 3.199832776
## 2 102.5078125 58.88243001 0.465318154 -0.515087909 1.677257525
## 3 103.015625 39.34164944 0.323328365 1.051164429 3.121237458
## 4 136.75 57.17844874 -0.068414638 -0.636238369 3.642976589
## 5 88.7265625 40.67222541 0.600866079 1.123491692 1.178929766
## 6 93.5703125 46.69811352 0.53190485 0.416721117 1.636287625
## SD_DMSNR_Curve EK_DMSNR_Curve Skewness_DMSNR_Curve Class
## 1 19.11042633 7.975531794 74.24222492 0
## 2 14.86014572 10.57648674 127.3935796 0
## 3 21.74466875 7.735822015 63.17190911 0
## 4 20.9592803 6.89649891 53.59366067 0
## 5 11.4687196 14.26957284 252.5673058 0
## 6 14.54507425 10.6217484 131.3940043 0
# Split the data into training and test sets (80% training, 20% test)
set.seed(10032024)
sample <- sample.int(n = nrow(pulsar_data), size = floor(.8*nrow(pulsar_data)), replace = F)
train <- pulsar_data[sample, ]
test <- pulsar_data[-sample, ]
# Separate features and target variable
X_train <- train[, !names(train) %in% "Class"]
y_train <- train$Class
X_test <- test[, !names(test) %in% "Class"]
y_test <- test$Class
# #Fit Logistic Regession Model (GLM)
# glm_model = glm(Class ~., data = train, family = binomial)
# glm_pred <- predict(glm_model, newdata = dataTest, type = "response")
# glm_class <- ifelse(glm_pred > 0.5, 1, 0)
# glm_confusion <- confusionMatrix(as.factor(glm_class), as.factor(dataTest$Class))
# glm_confusion
#
# #Fit GAM Model
# gam_model <- gam(Class ~ s(Mean_Integrated) + s(SD) + s(EK) + s(Skewness) + s(Mean_DMSNR_Curve) + s(SD_DMSNR_Curve) + s(EK_DMSNR_Curve) + s(Skewness_DMSNR_Curve), data = training_data, family = binomial)
# gam_pred <- predict(gam_model, newdata = dataTest, type = "response")
# gam_class <- ifelse(gam_pred > 0.5, 1, 0)
# gam_confusion <- confusionMatrix(as.factor(gam_class), as.factor(dataTest$Class))
# gam_confusion
MARS
library(earth)
library(caret)
library(ggplot2)
rm(list = ls())
pulsar_data <- read.csv("Pulsar.csv", header = TRUE, sep = ",")
X <- pulsar_data[, !(names(pulsar_data) %in% c("Class"))]
y <- pulsar_data$Class
# Split the data into training (80%) and testing (20%) sets
set.seed(5749)
trainIndex <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[trainIndex, ]
X_test <- X[-trainIndex, ]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]
train_data <- data.frame(X_train, target = y_train)
# Fit MARS model
mars_model <- earth(target ~ ., data = train_data)
# Predict on the test data
test_data <- data.frame(X_test)
y_pred_prob <- predict(mars_model, test_data)
y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)
# Calculate the confusion matrix
conf_matrix <- confusionMatrix(as.factor(y_pred), as.factor(y_test))
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3224 68
## 1 15 272
##
## Accuracy : 0.9768
## 95% CI : (0.9713, 0.9815)
## No Information Rate : 0.905
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.855
##
## Mcnemar's Test P-Value : 1.145e-08
##
## Sensitivity : 0.9954
## Specificity : 0.8000
## Pos Pred Value : 0.9793
## Neg Pred Value : 0.9477
## Prevalence : 0.9050
## Detection Rate : 0.9008
## Detection Prevalence : 0.9198
## Balanced Accuracy : 0.8977
##
## 'Positive' Class : 0
##
# Extract performance metrics
accuracy <- conf_matrix$overall['Accuracy']
sensitivity <- conf_matrix$byClass['Sensitivity']
specificity <- conf_matrix$byClass['Specificity']
precision <- conf_matrix$byClass['Precision']
f1 <- conf_matrix$byClass['F1']
# Create a data frame to store the metrics
metrics <- data.frame(
Metric = c("Accuracy", "Sensitivity", "Specificity", "Precision", "F1 Score"),
Value = c(accuracy, sensitivity, specificity, precision, f1)
)
#print(metrics)
results <- data.frame(Actual = y_test, Predicted_Prob = y_pred_prob)
#results
Question 4: Wavelets
library(fds)
library(wavelets)
library(glmnet)
library(ggplot2)
library(wavethresh)
library(tidyr)
rm(list = ls())
data(aa)
data(ao)
aa_data <- aa$y
ao_data <- ao$y
X <- rbind(aa_data, ao_data)
y <- c(rep(0, nrow(aa_data)), rep(1, nrow(ao_data)))
# Wavelet transform function
apply_wavelet <- function(data, n_coef = 256) {
next_power_of_2 <- 2^ceiling(log2(length(data)))
padded_data <- c(data, rep(0, next_power_of_2 - length(data)))
# Compute Wavelet coefficients
wt <- wd(padded_data, filter.number = 10, family = "DaubLeAsymm")
all_levels <- 0:(nlevelsWT(wt) - 1)
all_coefs <- unlist(lapply(all_levels, function(l) accessD(wt, level = l)))
return(all_coefs[1:min(length(all_coefs), n_coef)])
}
# Apply wavelet transform to each row
X_wavelet <- t(apply(X, 1, apply_wavelet))
# Plot a sample of signals and their corresponding wavelet coefficients
plot_samples <- function(data, wavelet_data, title_signal, title_wavelet, sample_size = 10) {
sampled_indices <- sample(nrow(data), sample_size)
sampled_data <- data[sampled_indices, ]
sampled_wavelet_data <- wavelet_data[sampled_indices, ]
# Plot signals
signal_data <- as.data.frame(t(sampled_data))
signal_data$Index <- 1:nrow(signal_data)
signal_data_long <- pivot_longer(signal_data, -Index, names_to = "Sample", values_to = "Amplitude")
plot_signal <- ggplot(signal_data_long, aes(x = Index, y = Amplitude, color = Sample)) +
geom_line(alpha = 0.5) +
labs(title = title_signal, x = "Index", y = "Amplitude") +
theme_minimal()
# Plot wavelet coefficients
coeff_data <- as.data.frame(t(sampled_wavelet_data))
coeff_data$Index <- 1:nrow(coeff_data)
coeff_data_long <- pivot_longer(coeff_data, -Index, names_to = "Sample", values_to = "Coefficient")
plot_wavelet <- ggplot(coeff_data_long, aes(x = Index, y = Coefficient, color = Sample)) +
geom_line(alpha = 0.5) +
labs(title = title_wavelet, x = "Coefficient Index", y = "Coefficient Value") +
theme_minimal()
return(list(signal_plot = plot_signal, wavelet_plot = plot_wavelet))
}
# Plot for 'aa' class
aa_plots <- plot_samples(aa_data, t(apply(aa_data, 1, apply_wavelet)),
"Sample of Signals from 'aa' Class",
"Sample of Wavelet Coefficients for 'aa' Class")
print(aa_plots$signal_plot)

print(aa_plots$wavelet_plot)

# Plot for 'ao' class
ao_plots <- plot_samples(ao_data, t(apply(ao_data, 1, apply_wavelet)),
"Sample of Signals from 'ao' Class",
"Sample of Wavelet Coefficients for 'ao' Class")
print(ao_plots$signal_plot)

print(ao_plots$wavelet_plot)

# Split data
set.seed(123)
total_samples <- nrow(X_wavelet)
train_size <- floor(0.7 * total_samples)
train_indices <- sample(1:total_samples, train_size)
X_train <- X_wavelet[train_indices, ]
X_test <- X_wavelet[-train_indices, ]
y_train <- y[train_indices]
y_test <- y[-train_indices]
# Fit the model
cv_fit <- cv.glmnet(X_train, y_train, family = "binomial", alpha = 1)
best_lambda <- cv_fit$lambda.min
model <- glmnet(X_train, y_train, family = "binomial", alpha = 1, lambda = best_lambda)
# Make predictions
predictions <- predict(model, X_test, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
# Calculate accuracy
accuracy <- mean(predicted_classes == y_test)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 1"
# Create confusion matrix
conf_matrix <- table(Predicted = predicted_classes, Actual = y_test)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 43 0
## 1 0 47
# Calculate evaluation metrics
precision <- conf_matrix[2, 2] / (conf_matrix[2, 2] + conf_matrix[2, 1])
recall <- conf_matrix[2, 2] / (conf_matrix[2, 2] + conf_matrix[1, 2])
f1_score <- 2 * precision * recall / (precision + recall)
metrics <- data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1 Score"),
Value = c(accuracy, precision, recall, f1_score)
)
print(metrics)
## Metric Value
## 1 Accuracy 1
## 2 Precision 1
## 3 Recall 1
## 4 F1 Score 1
# Get feature importances
coef_importance <- abs(coef(model)[-1])
top_features <- order(coef_importance, decreasing = TRUE)[1:10]
print("Top 10 most important wavelet coefficients:")
## [1] "Top 10 most important wavelet coefficients:"
print(top_features)
## [1] 83 19 144 88 12 75 204 142 104 37
# Plot feature importances
ggplot(data.frame(Index = 1:length(coef_importance), Importance = coef_importance),
aes(x = Index, y = Importance)) +
geom_point() +
geom_point(data = data.frame(Index = top_features, Importance = coef_importance[top_features]),
aes(x = Index, y = Importance), color = "red", size = 3) +
labs(title = "Wavelet Coefficient Importances", x = "Coefficient Index", y = "Absolute Coefficient Value") +
theme_minimal()

# Extract and plot the non-zero coefficients
non_zero_coeffs <- coef(model)[coef(model) != 0]
non_zero_indices <- which(coef(model) != 0)
coeff_data <- data.frame(Index = non_zero_indices, Coefficient = non_zero_coeffs)
# Plot the non-zero coefficients
ggplot(coeff_data, aes(x = Index, y = Coefficient)) +
geom_bar(stat = "identity") +
labs(title = "Non-zero Coefficients from Penalized Logistic Regression",
x = "Wavelet Coefficient Index",
y = "Coefficient Value") +
theme_minimal()

Question 5: Functional Data Analysis
library(gamair)
library(ggplot2)
library(splines)
rm(list = ls())
data(gas)
# Extract the spectra and octane ratings
nir_spectra <- matrix(as.numeric(unlist(gas$NIR)), nrow = 60, byrow = TRUE)
colnames(nir_spectra) <- colnames(gas$NIR)
nir_spectra <- t(nir_spectra) # Transpose to have 401 rows and 60 columns
octane <- gas$octane
# Define the B-spline basis functions
basis_functions <- bs(seq(900, 1700, by = 2), df = 20, degree = 3, intercept = TRUE)
# Define function to implement basis expansion
basis_expansion <- function(data, basis) {
expanded_data <- matrix(0, nrow = ncol(basis), ncol = ncol(data))
for (i in 1:ncol(data)) {
expanded_data[, i] <- t(basis) %*% data[, i]
}
return(expanded_data)
}
# Apply basis expansion
nir_fd_coefs <- basis_expansion(nir_spectra, basis_functions)
nir_fd_coefs_df <- as.data.frame(t(nir_fd_coefs))
colnames(nir_fd_coefs_df) <- paste0("X", 1:ncol(nir_fd_coefs_df))
model_data <- cbind(octane = octane, nir_fd_coefs_df)
# Fit the linear model
formula <- as.formula(paste("octane ~", paste(colnames(nir_fd_coefs_df), collapse = " + "), "- 1"))
lm_model <- lm(formula, data = model_data)
summary(lm_model)
##
## Call:
## lm(formula = formula, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.37 13.12 54.24 81.60 130.43
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X1 -1907.9 1025.8 -1.860 0.0703 .
## X2 1149.6 1948.9 0.590 0.5586
## X3 -656.2 2360.8 -0.278 0.7825
## X4 949.0 1831.7 0.518 0.6072
## X5 570.8 2122.2 0.269 0.7893
## X6 -1459.1 1701.9 -0.857 0.3964
## X7 -190.4 2276.4 -0.084 0.9338
## X8 -595.2 2231.0 -0.267 0.7910
## X9 1507.7 2110.0 0.715 0.4790
## X10 426.0 3048.5 0.140 0.8896
## X11 1389.5 2558.7 0.543 0.5901
## X12 -3365.9 2972.6 -1.132 0.2643
## X13 -132.3 2817.8 -0.047 0.9628
## X14 -351.4 2508.1 -0.140 0.8893
## X15 4001.0 2626.5 1.523 0.1356
## X16 -2283.2 2597.2 -0.879 0.3846
## X17 744.2 2049.0 0.363 0.7184
## X18 -2820.2 2314.7 -1.218 0.2302
## X19 2217.7 1714.6 1.293 0.2033
## X20 889.6 913.6 0.974 0.3360
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83.7 on 40 degrees of freedom
## Multiple R-squared: 0.3856, Adjusted R-squared: 0.07842
## F-statistic: 1.255 on 20 and 40 DF, p-value: 0.2637
# First principles prediction using the fitted coefficients
fitted_coefficients <- coef(lm_model)
octane_pred_manual <- as.matrix(nir_fd_coefs_df) %*% fitted_coefficients
results <- data.frame(Actual = octane, Predicted = octane_pred_manual)
# Plot actual vs predicted values
ggplot(results, aes(x = Actual, y = Predicted)) +
geom_point(color = 'blue') +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "Actual vs Predicted Octane Ratings",
x = "Actual Octane Rating",
y = "Predicted Octane Rating") +
theme_minimal()

print(fitted_coefficients)
## X1 X2 X3 X4 X5 X6 X7
## -1907.9197 1149.6367 -656.1988 948.9994 570.8192 -1459.0755 -190.3914
## X8 X9 X10 X11 X12 X13 X14
## -595.2310 1507.6761 426.0391 1389.5247 -3365.8842 -132.2741 -351.4226
## X15 X16 X17 X18 X19 X20
## 4000.9962 -2283.1757 744.1567 -2820.2456 2217.7276 889.6180
nir_spectra_df <- as.data.frame(nir_spectra)
nir_spectra_df$Wavelength <- seq(900, 1700, by = 2)
nir_spectra_long <- tidyr::pivot_longer(nir_spectra_df, cols = -Wavelength, names_to = "Sample", values_to = "log1R")
ggplot(nir_spectra_long, aes(x = Wavelength, y = log1R, group = Sample, color = Sample)) +
geom_line(alpha = 0.5) +
labs(title = "NIR Spectra",
x = "Wavelength (nm)",
y = "log(1/R)") +
theme_minimal() +
theme(legend.position = "none")
